import spotipy
import spotipy.util as util
from spotipy.oauth2 import SpotifyClientCredentials
import json
import requests
import time
import sys
from math import log
from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode,plot,iplot
import plotly.figure_factory as ff
import chart_studio.tools as tls
import chart_studio.plotly as py
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Read the two Spotify streaming-history export files, one after the other.
history_parts = []
for history_path in ('MyData/StreamingHistory0.json',
                     'MyData/StreamingHistory1.json'):
    with open(history_path, encoding='utf8') as f:
        history_parts.append(json.load(f))
jsondata1, jsondata2 = history_parts

# Single combined record sequence, first file's records first.
jsondata = jsondata1 + jsondata2

# Empty frame; its columns are filled in from the records afterwards.
my_streaming = pd.DataFrame()
def extract_json_value(column_name, data):
    """Collect the ``column_name`` entry of every record in ``data``.

    The result is a list in the same order as ``data``. A record that lacks
    the key raises ``KeyError``.
    """
    values = []
    for record in data:
        values.append(record[column_name])
    return values
# Columns stored for each streaming instance:
#   track_name  - Track Name
#   artist_name - Artist Name
#   end_time    - Timestamp of End Time listening to that track stream
#   ms_played   - Milliseconds listened in that instance
streaming_fields = (
    ('track_name', 'trackName'),
    ('artist_name', 'artistName'),
    ('end_time', 'endTime'),
    ('ms_played', 'msPlayed'),
)
for frame_column, json_key in streaming_fields:
    my_streaming[frame_column] = extract_json_value(json_key, jsondata)
# 1) Go to https://developer.spotify.com/dashboard/applications
# 2) Create an App
# 3) Name your App
# 4) Open the App and find its Client ID and Client Secret
import os

username = 'shahv1057'
# App credentials from the Spotify developer dashboard. They are read from the
# environment (the variable names spotipy itself recognises) so the client
# secret is never stored in the source file.
# NOTE(review): the secret that used to be hard-coded here should be treated
# as leaked and regenerated in the dashboard.
try:
    client_id = os.environ['SPOTIPY_CLIENT_ID']
    client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
except KeyError as missing:
    raise RuntimeError(
        f'Set the {missing.args[0]} environment variable before running'
    ) from missing
# This can be any localhost site
redirect_uri = 'http://localhost:1234/callback'
scope = 'user-read-recently-played'
# Running this opens a prompt at 'redirect_uri'; click 'agree' to authorize
# and connect to the API.
token = util.prompt_for_user_token(username=username,
                                   scope=scope,
                                   client_id=client_id,
                                   client_secret=client_secret,
                                   redirect_uri=redirect_uri)
def get_id(track_name, artist, token):
    '''
    Look up a track through the Spotify search endpoint.

    Input: Track Name, Artist Name, and API token
    Output: Spotify's unique Track ID for the first search hit, or None when
            the request fails or the response holds no usable track
    '''
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer ' + token,
    }
    trackandartist = track_name + " " + artist
    params = [
        # q is the search query parameter
        ('q', trackandartist),
        ('type', 'track'),
    ]
    try:
        response = requests.get('https://api.spotify.com/v1/search',
                                headers=headers, params=params, timeout=10)
        payload = response.json()
        return payload['tracks']['items'][0]['id']
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Best-effort lookup: network errors, undecodable bodies and responses
        # without a first track all mean "no ID". KeyboardInterrupt and
        # SystemExit are deliberately not caught, so a long run can be stopped.
        return None
# Search for each distinct (track, artist) pair once and reuse the answer for
# every stream of that pair, instead of issuing one request per stream.
# This may still take several minutes depending on your listening history.
unique_pairs = my_streaming[["track_name", "artist_name"]].drop_duplicates()
track_id_by_pair = {
    (pair_track, pair_artist): get_id(pair_track, pair_artist, token)
    for pair_track, pair_artist in unique_pairs.itertuples(index=False, name=None)
}
my_streaming["track_id"] = [
    track_id_by_pair[pair]
    for pair in zip(my_streaming["track_name"], my_streaming["artist_name"])
]
# Distinct track IDs that were actually found (failed lookups are None).
trackid = list(my_streaming["track_id"].dropna().unique())
audio_columns = [
    "energy", "tempo", "speechiness",
    "acousticness", "instrumentalness", "danceability",
    "loudness", "valence",
]
# Authorize access to audio features
client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager,
                     requests_timeout=100)
# For each Track ID in my Spotify-provided listening history, import
# Spotify's audio features. Rows are collected in a list and the DataFrame is
# built once: DataFrame.append no longer exists in pandas 2.0 and copied the
# whole frame on every call.
# The audio-features endpoint accepts up to 100 IDs per request, so the IDs
# are sent in batches rather than one request per track.
feature_rows = []
for batch_start in range(0, len(trackid), 100):
    batch = trackid[batch_start:batch_start + 100]
    for track, features in zip(batch, sp.audio_features(tracks=batch)):
        print(track)
        # Tracks without audio features come back as None and are skipped.
        if features is not None:
            feature_rows.append(
                {"track_id": track, **{name: features[name] for name in audio_columns}}
            )
my_features = pd.DataFrame(feature_rows, columns=["track_id"] + audio_columns)
# Attach the track and artist names from the streaming history.
my_features = my_features.merge(
    my_streaming[['track_id', 'track_name', 'artist_name']],
    how="left", left_on="track_id", right_on="track_id",
).drop_duplicates()
def acquire_album(track_id):
    """Return the album name of the track with the given Spotify ID.

    The track is fetched through the module-level ``sp`` client.
    """
    track_info = sp.track(track_id)
    album_info = track_info['album']
    return album_info['name']
# Look up the album name for every row of the feature table.
my_features['album'] = my_features['track_id'].map(acquire_album)

# Carry the album over to the streaming records, then drop exact duplicates.
album_lookup = my_features[['track_id', 'album']]
my_streaming = my_streaming.merge(album_lookup, how="left", on="track_id")
my_streaming = my_streaming.drop_duplicates()

# Save both tables to disk.
for frame, pickle_path in ((my_streaming, 'my_streaming.pkl'),
                           (my_features, 'my_features.pkl')):
    frame.to_pickle(pickle_path)
# Stacked bar chart: minutes listened per month for the 20 most-played songs.
songs = my_streaming.assign(
    month_year=lambda frame: pd.to_datetime(frame['end_time']).dt.to_period('M'),
    ms_played=lambda frame: frame['ms_played'] / 60000,  # milliseconds -> minutes
)

# The 20 track names with the most total listening time.
minutes_per_song = songs.groupby('track_name')['ms_played'].sum()
top20songs = minutes_per_song.sort_values(ascending=False).nlargest(20)
is_top_song = songs['track_name'].isin(top20songs.index)
songs = songs[is_top_song]

# One row per (song, artist, month) with the minutes listened, rounded.
song_groups = songs.groupby(['track_name', 'artist_name', 'month_year'])
plotly_songs_df = song_groups['ms_played'].sum().reset_index()
plotly_songs_df['month_year'] = plotly_songs_df['month_year'].astype(str)
plotly_songs_df['ms_played'] = plotly_songs_df['ms_played'].round()

months_order = [
    '2019-06', '2019-07', '2019-08', '2019-09', '2019-10', '2019-11', '2019-12',
    '2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06',
]
# One colour per month, in the same order as months_order.
colors = [
    "#ECDFDE", "#E3BFBE", "#DE999E", "#DE7082", "#E3416A", "#EC0E5B", "#D10550",
    "#B30046", "#91003A", "#6D002E", "#480020", "#240011", "Black",
]
labels = {
    "month_year": "Month",
    "track_name": "Song",
    "ms_played": "Minutes Listening",
    'artist_name': 'Artist',
}
song_bar_options = dict(
    x='track_name',
    y='ms_played',
    hover_data=['track_name', 'artist_name', 'month_year', 'ms_played'],
    opacity=.8,
    title='(Last 12 Months)',
    color='month_year',
    labels=labels,
    category_orders={"month_year": months_order},
    color_discrete_sequence=colors,
)
fig = px.bar(plotly_songs_df, **song_bar_options)
fig.update_traces(marker_line_width=0, marker_line_color='black')
fig.show()
# Stacked bar chart: minutes listened per month for the 20 most-played artists.
artists = my_streaming.assign(
    month_year=lambda frame: pd.to_datetime(frame['end_time']).dt.to_period('M'),
    ms_played=lambda frame: frame['ms_played'] / 60000,  # milliseconds -> minutes
)

# The 20 artist names with the most total listening time.
minutes_per_artist = artists.groupby('artist_name')['ms_played'].sum()
top20artists = minutes_per_artist.sort_values(ascending=False).nlargest(20)
is_top_artist = artists['artist_name'].isin(top20artists.index)
artists = artists[is_top_artist]

# One row per (song, artist, month) with the minutes listened, rounded.
artist_groups = artists.groupby(['track_name', 'artist_name', 'month_year'])
plotly_artists_df = artist_groups['ms_played'].sum().reset_index()
plotly_artists_df['month_year'] = plotly_artists_df['month_year'].astype(str)
plotly_artists_df['ms_played'] = plotly_artists_df['ms_played'].round()

months_order = [
    '2019-06', '2019-07', '2019-08', '2019-09', '2019-10', '2019-11', '2019-12',
    '2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06',
]
# One colour per month, in the same order as months_order.
colors = [
    'rgb(237,248,251)', 'rgb(237,248,251)',
    'rgb(204,236,230)', 'rgb(204,236,230)',
    'rgb(153,216,201)', 'rgb(153,216,201)',
    'rgb(102,194,164)', 'rgb(102,194,164)',
    'rgb(44,162,95)', 'rgb(44,162,95)',
    'rgb(0,109,44)', 'rgb(0,109,44)',
    'rgb(0,85,23)',
]
labels = {
    "month_year": "Month",
    "track_name": "Song",
    "ms_played": "Minutes Listening",
    'artist_name': 'Artist',
}
artist_bar_options = dict(
    x='artist_name',
    y='ms_played',
    hover_data=['track_name', 'artist_name', 'month_year', 'ms_played'],
    opacity=.8,
    title='(Last 12 Months)',
    color='month_year',
    labels=labels,
    category_orders={"month_year": months_order},
    color_discrete_sequence=colors,
)
fig = px.bar(plotly_artists_df, **artist_bar_options)
fig.update_traces(marker_line_width=0, marker_line_color='black')
fig.show()
# Stacked bar chart: minutes listened per month for the 20 most-played albums.
albums = my_streaming.assign(
    month_year=lambda frame: pd.to_datetime(frame['end_time']).dt.to_period('M'),
    ms_played=lambda frame: frame['ms_played'] / 60000,  # milliseconds -> minutes
)

# The 20 album names with the most total listening time.
minutes_per_album = albums.groupby('album')['ms_played'].sum()
top20albums = minutes_per_album.sort_values(ascending=False).nlargest(20)
is_top_album = albums['album'].isin(top20albums.index)
albums = albums[is_top_album]

# One row per (album, song, artist, month) with the minutes listened, rounded.
album_groups = albums.groupby(['album', 'track_name', 'artist_name', 'month_year'])
plotly_albums_df = album_groups['ms_played'].sum().reset_index()
plotly_albums_df['month_year'] = plotly_albums_df['month_year'].astype(str)
plotly_albums_df['ms_played'] = plotly_albums_df['ms_played'].round()

months_order = [
    '2019-06', '2019-07', '2019-08', '2019-09', '2019-10', '2019-11', '2019-12',
    '2020-01', '2020-02', '2020-03', '2020-04', '2020-05', '2020-06',
]
# One colour per month, in the same order as months_order.
colors = [
    'rgb(254,240,217)', 'rgb(254,240,217)',
    'rgb(253,212,158)', 'rgb(253,212,158)',
    'rgb(253,187,132)', 'rgb(253,187,132)',
    'rgb(252,141,89)', 'rgb(252,141,89)',
    'rgb(227,74,51)', 'rgb(227,74,51)',
    'rgb(179,0,0)', 'rgb(179,0,0)',
    'rgb(110,0,0)',
]
labels = {
    "month_year": "Month",
    "track_name": "Song",
    "ms_played": "Minutes Listening",
    'artist_name': 'Artist',
    'album': 'Album',
}
album_bar_options = dict(
    x='album',
    y='ms_played',
    hover_data=['album', 'track_name', 'artist_name', 'month_year', 'ms_played'],
    opacity=.8,
    title='(Last 12 Months)',
    color='month_year',
    labels=labels,
    category_orders={"month_year": months_order},
    color_discrete_sequence=colors,
)
fig = px.bar(plotly_albums_df, **album_bar_options)
fig.update_traces(marker_line_width=0, marker_line_color='black')
fig.show()
# Total minutes listened per track over the whole history.
minplayed = my_streaming.groupby('track_id')['ms_played'].sum().div(60000).reset_index()
# Keep tracks with more than 13 minutes of listening in total.
listens15 = minplayed[minplayed['ms_played'] > 13]
song_prefs = listens15.merge(my_features, how="left", on="track_id").drop_duplicates()

# Identifying columns versus the audio features used for modelling.
metadata_columns = ['track_id', 'track_name', 'artist_name', 'album']
audio_feature_columns = ['energy', 'tempo', 'speechiness', 'acousticness',
                         'instrumentalness', 'danceability', 'loudness', 'valence']
song_prefs = song_prefs[metadata_columns + audio_feature_columns].dropna()
X = song_prefs.drop(columns=metadata_columns)

# Rescale loudness, then tempo, to [0, 1]; the same scaler object is re-fitted
# for each column, so afterwards it holds the tempo fit only.
scaler = MinMaxScaler()
for scaled_column in ('loudness', 'tempo'):
    X[scaled_column] = scaler.fit_transform(X[scaled_column].values.reshape(-1, 1))

# Correlation between the audio features.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(X.corr(), cmap=cmap)

# Distribution curve of every audio feature on one figure.
group_labels = list(X.columns)
distplot_data = [list(X[feature]) for feature in X.columns]
fig = ff.create_distplot(distplot_data, group_labels, bin_size=.02, show_hist=False)
fig.update_layout({"template": 'plotly_white'})
fig.update_yaxes(range=[0, 7])
fig.show()
# Elbow analysis: fit KMeans for 1..14 clusters and record each inertia.
inertia = {}
for n in range(1, 15):
    # KMeans no longer accepts n_jobs (removed in scikit-learn 1.0), so it is
    # not passed. n_init=10 pins the number of restarts to the historical
    # default so results do not depend on the installed version's default.
    kmeans = KMeans(n_clusters=n, random_state=1, n_init=10).fit(X.values)
    inertia[n] = kmeans.inertia_
cluster_num = list(inertia.keys())
inertia_vals = list(inertia.values())
fig = go.Figure()
fig.add_trace(go.Scatter(x=cluster_num, y=inertia_vals,
                         mode='lines+markers'))
# Circle drawn around the fourth point (index 3, i.e. 4 clusters).
elbow = [dict(type="circle",
              xref="x", yref="y",
              x0=cluster_num[3] - .4, y0=inertia_vals[3] - 5,
              x1=cluster_num[3] + .4, y1=inertia_vals[3] + 5,
              line=dict(color="Red"))]
# Two buttons toggle the elbow marker off ("None") and on ("Elbow").
fig.update_layout(xaxis_title="Number of Clusters",
                  yaxis_title="Inertia",
                  updatemenus=[
                      dict(
                          type="buttons",
                          buttons=[
                              dict(label="None",
                                   method="relayout",
                                   args=["shapes", []]),
                              dict(label="Elbow",
                                   method="relayout",
                                   args=["shapes", elbow])
                          ])])
config = {'displayModeBar': False}
fig.show(config=config)
# Final clustering with the number of clusters picked from the elbow plot.
n_clusters = 4
# KMeans no longer accepts n_jobs (removed in scikit-learn 1.0); n_init=10
# pins the number of restarts to the historical default.
kmeans = KMeans(n_clusters=n_clusters, random_state=1, n_init=10).fit(X.values)
y_kmeans = kmeans.predict(X.values)

# Project the features onto two principal components for plotting.
pca_2D = PCA(n_components=2)
principal_components_2D = pca_2D.fit_transform(X.values)
pc2D = pd.DataFrame(principal_components_2D)
pc2D['label'] = [str(y) for y in y_kmeans]
pc2D.columns = ['x', 'y', 'label']
fig = px.scatter(pc2D,
                 x='x',
                 y='y',
                 color='label',
                 color_discrete_map={'0': 'purple', '1': 'blue', '2': 'green', '3': 'red'},
                 category_orders={"label": ["0", "1", "2", "3"]})
# Display the scatter; it was previously built and then discarded.
fig.show()
# Variance explained by each component, and in total.
print(pca_2D.explained_variance_ratio_, sum(pca_2D.explained_variance_ratio_))
from mpl_toolkits.mplot3d import Axes3D

# Project the features onto three principal components for plotting.
pca_3D = PCA(n_components=3)
principal_components_3D = pca_3D.fit_transform(X.values)
pc3D = pd.DataFrame(principal_components_3D)
pc3D['label'] = [str(y) for y in y_kmeans]
pc3D.columns = ['x', 'y', 'z', 'label']
fig = px.scatter_3d(pc3D,
                    x='x',
                    y='y',
                    z='z',
                    color='label',
                    color_discrete_map={'0': 'purple', '1': 'blue', '2': 'green', '3': 'red'},
                    category_orders={"label": ["0", "1", "2", "3"]})
# Display the scatter; it was previously built and then discarded.
fig.show()
# Variance explained by each component, and in total.
print(pca_3D.explained_variance_ratio_, sum(pca_3D.explained_variance_ratio_))
# Attach each song's cluster label.
song_prefs['label'] = y_kmeans

# Bar chart: number of songs in each cluster.
label_counts = song_prefs['label'].value_counts()
fig = sns.barplot(x=label_counts.index, y=label_counts)
plt.title('# of Songs in each Group')
plt.ylabel('')
fig = fig.get_figure()
fig.set_size_inches(10, 4)
fig.show()

# Heatmap: per-cluster mean of every audio feature, standardized per feature.
scaler = StandardScaler()
sns.set(font_scale=1.6, font='Times New Roman')
feature_names = list(X.columns)
# Average only the audio-feature columns. song_prefs also holds string columns
# (IDs, names, album), which make a plain groupby().mean() raise on
# pandas >= 2.0; selecting explicitly also keeps rows aligned with the labels.
cluster_means = song_prefs.groupby('label')[feature_names].mean()
fig = sns.heatmap(scaler.fit_transform(cluster_means).T,
                  cmap='coolwarm',
                  yticklabels=[name.capitalize() for name in feature_names],
                  annot=True)
fig = fig.get_figure()
fig.set_size_inches(16, 8)
# Train a random forest to predict the KMeans label from the audio features.
scaler = StandardScaler()
standardized_features = scaler.fit_transform(X.values)
Xtrain, Xtest, ytrain, ytest = train_test_split(
    standardized_features, y_kmeans, test_size=.25, random_state=1,
)
clf = RandomForestClassifier(n_estimators=30, random_state=10, criterion='entropy')
clf.fit(Xtrain, ytrain)
ypred = clf.predict(Xtest)

# (importance, feature name) pairs, importances rounded to 2 decimals.
rounded_importances = list(np.round(clf.feature_importances_, 2))
print(list(zip(rounded_importances, group_labels)))

# Tick labels for cluster indices 0..3.
# NOTE(review): the moods list defined later for playlist creation orders these
# names differently - confirm which cluster-to-mood mapping is intended.
moods = ['Hype', 'Angsty', 'Happy', "Sad"]

# Confusion matrix: rows are true labels, columns are predicted labels.
classification_matrix = np.zeros((4, 4))
np.add.at(classification_matrix, (ytest, ypred), 1)
ax = sns.heatmap(classification_matrix,
                 cmap='Blues',
                 cbar=False,
                 annot=True,
                 xticklabels=moods,
                 yticklabels=moods)
ax.set(xlabel='Preds', ylabel='True')
plt.show()
from spotipy.oauth2 import SpotifyOAuth

# Re-authorize with the scope used for modifying public playlists.
scope = 'playlist-modify-public'
oauth_settings = dict(
    client_id=client_id,
    client_secret=client_secret,
    redirect_uri=redirect_uri,
    scope=scope,
    username=username,
)
token = util.prompt_for_user_token(**oauth_settings)

# Replace the client with one backed by the user-level OAuth manager.
auth_manager = SpotifyOAuth(**oauth_settings)
sp = spotipy.Spotify(auth_manager=auth_manager)
# Rebuild the song table with a lower listening threshold and label it with
# the already-fitted KMeans model. (This computation used to appear twice in a
# row with identical results; it is now done once.)
minplayed = (my_streaming.groupby('track_id')['ms_played'].sum() / 60000).reset_index()
minutes_bar = 10
listens_mins = minplayed[minplayed.ms_played > minutes_bar]
song_prefs = listens_mins.merge(my_features,
                                how="left",
                                left_on="track_id",
                                right_on="track_id").drop_duplicates().reset_index(drop=True)
song_prefs = song_prefs[['track_id', 'track_name', 'artist_name', 'album', 'energy', 'tempo',
                         'speechiness', 'acousticness', 'instrumentalness', 'danceability',
                         'loudness', 'valence']].dropna()
X = song_prefs.drop(['track_id', 'track_name', 'artist_name', 'album'], axis=1)
# NOTE(review): the scaler is re-fitted on this larger subset, so loudness and
# tempo are not scaled with the same min/max the KMeans model was trained on -
# confirm that is acceptable.
scaler = MinMaxScaler()
X.loudness = scaler.fit_transform(X.loudness.values.reshape(-1, 1))
X.tempo = scaler.fit_transform(X.tempo.values.reshape(-1, 1))
y_kmeans = kmeans.predict(X.values)
song_prefs['label'] = y_kmeans
def create_mood_playlists(moods, df, num_clusters, playlist_length):
    """Create one Spotify playlist per cluster from a random sample of its tracks.

    Input:
        moods: playlist names, indexed by cluster label
        df: DataFrame with a 'label' column (cluster index) and a 'track_id' column
        num_clusters: number of cluster labels to process (0 .. num_clusters - 1)
        playlist_length: maximum number of tracks per playlist; a cluster with
            fewer tracks contributes all of them
    Output: None. Playlists are created for ``username`` through the
        module-level ``sp`` client.
    """
    for moodnum in range(num_clusters):
        cluster_tracks = df.loc[df.label == moodnum, 'track_id']
        # Take the ID from the creation response; looking it up as "the first
        # playlist of the user" depends on listing order.
        playlist = sp.user_playlist_create(username, moods[moodnum])
        # Never ask for more tracks than the cluster has (sample would raise).
        sample_size = min(playlist_length, len(cluster_tracks))
        if sample_size == 0:
            continue
        playlist_song_IDs = cluster_tracks.sample(sample_size).tolist()
        # Spotify accepts at most 100 tracks per add request.
        for chunk_start in range(0, len(playlist_song_IDs), 100):
            sp.user_playlist_add_tracks(
                username,
                playlist['id'],
                playlist_song_IDs[chunk_start:chunk_start + 100],
            )
# Playlist settings: one playlist of 20 tracks per cluster.
num_clusters = 4
playlist_length = 20
# Playlist names, indexed by cluster label.
# NOTE(review): the moods list used earlier for the confusion-matrix labels
# orders these names differently - confirm which mapping is intended.
moods = ['Sad', 'Happy', 'Angsty', "Hype"]
create_mood_playlists(
    moods=moods,
    df=song_prefs,
    num_clusters=num_clusters,
    playlist_length=playlist_length,
)